#### Convert Royal_Society_Corpus_v2.0.2_final.vrt file to txts ####


import os
import pandas as pd
from collections import defaultdict
from string import punctuation

# Root of the data directory; the input corpus is read from, and the output
# files are written to, sub-folders of 'historical-corpora/' beneath it.
path = '/Users/sunyambagga/Desktop/txtLAB-2/detecting-narrativity/data/'

# Load every line of the .vrt corpus into memory. The encoding is stated
# explicitly so decoding does not depend on the platform's default locale.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(path+'historical-corpora/Royal_Society_Corpus_v2.0.2_final.vrt', 'r', encoding='utf-8') as f:
    all_lines = f.readlines()



# Group the corpus tokens by the <text> element they belong to.
#   titles_covered  - lower-cased title of every <text> header seen, in order
#   map_fname_words - output filename -> list of tokens (first tab-separated
#                     column of each non-markup line)
titles_covered = []
map_fname_words = defaultdict(list) # filename is text_id + '____' + year + '.txt'
filename = None # output filename of the <text> currently being read

for line_no, line in enumerate(all_lines, start=1):
    if line.startswith('<text id='):
        # A new text begins: pull id, title and year out of the header.
        # Each value is taken from just after `name="` up to the next `" `.
        text_id = line.split('<text id="')[1].split('" ')[0]
        title = line.split('title="')[1].split('" ')[0]
        year = line.split('year="')[1].split('" ')[0]
        titles_covered.append(title.lower())
        filename = text_id + '____' + year + '.txt'
        continue

    if line.startswith('<'):
        # Any other markup line (opening or closing tag) carries no token.
        continue

    # Drop the line terminator first so a line without tabs does not leak a
    # trailing newline into the token, then keep only the first column.
    word = line.rstrip('\r\n').split('\t')[0]
    if not word:
        # Blank line or empty first column: nothing to record.
        continue

    if filename is None:
        # Fail with a clear message instead of a NameError on `filename`.
        raise ValueError(
            "Token found before any <text id=...> header at line {}".format(line_no))

    map_fname_words[filename].append(word)

print("Total lines in .vrt file:", len(all_lines))
print("Total Filenames: {} | Total Titles: {}".format(len(map_fname_words), len(titles_covered)))

# Write one plain-text file per <text>, re-joining its tokens into running text.
out_dir = path + 'historical-corpora/royal-society-corpus-txts/'
os.makedirs(out_dir, exist_ok=True) # create the output folder on first run

for fname, words in map_fname_words.items():
    pieces = []
    for word in words:
        # Punctuation is glued to the preceding token; brackets and ordinary
        # words are preceded by a space. Both `in` tests are substring tests
        # on strings, exactly as in the original spacing rule.
        if word not in '()[]' and word in punctuation:
            pieces.append(word)
        else:
            pieces.append(' ' + word)

    # Explicit encoding so the output does not depend on the platform locale.
    with open(out_dir + fname, 'w', encoding='utf-8') as F:
        F.write(''.join(pieces).strip())
